package photoassociation.qizx;

import com.aliasi.corpus.ObjectHandler;
import com.aliasi.cluster.LatentDirichletAllocation;
import com.aliasi.tokenizer.*;
import com.aliasi.symbol.*;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;
import com.aliasi.stats.Statistics;
import com.qizx.xquery.fn.StaticBaseUri;

import java.util.*;

public class LDADocumentRepresentation {
	
	/**
	 * Runs a text through {@code WORMBASE_TOKENIZER_FACTORY} and joins the tokens that come out.
	 *
	 * <p>The result is whatever the tokenizer chain built by {@code simpleTokenizerFactory()} emits,
	 * so the tokens are not only stop-word filtered but also transformed by every other stage of
	 * that chain.
	 *
	 * <p>Every token is preceded by one space, so a non-empty result starts with a space. That
	 * quirk is kept on purpose because callers may depend on the exact string.
	 *
	 * @param doc text to tokenize; must not be {@code null}
	 * @return the emitted tokens, each prefixed with a single space, or the empty string when the
	 *         tokenizer emits nothing
	 */
	public static String removeStopWords(String doc){
		char[] text = doc.toCharArray();
		Tokenizer tokens = WORMBASE_TOKENIZER_FACTORY.tokenizer(text, 0, text.length);

		// StringBuilder instead of String += : repeated concatenation re-copies the whole
		// accumulated result for every token.
		StringBuilder joined = new StringBuilder();
		for (String tok = tokens.nextToken(); tok != null; tok = tokens.nextToken()) {
			joined.append(' ').append(tok);
		}
		return joined.toString();
	}
	/**
	 * Demo entry point: fits a 4-topic model on five hard-coded sample texts, then prints the
	 * similarity between a fixed probe text and a fixed query under each of the three metrics
	 * supported by {@code documentSimilarity}.
	 *
	 * @param args ignored
	 * @throws Exception propagated from {@code fitLDA}
	 */
	public static void main ( String args[] ) throws Exception {

		final String[] corpus = {
				"This bridge is now in the upper question reaches of California Gold Country, a vast area throughout the western foothills and mountains of the Sierra Nevada Range.",
				"A dark cloud rolled over and at something first everything was dark. I had hiked back in to Round Valley Regional Park for the first time not knowing what I would see. ",
				"The sign leading into this remote area Different in California's Gold Country warns. I don t know that it 's primitive, but it is narrow (one lane) and if you happen to drive off, there are places where you can drop more than feet into the American River below. ",
				"The famous, a historic area of the original North American Transcontinental Railroad can be seen at the extreme upper edge of the photo, just left of center. That area of the rail line is some  feet above the valley floor.",
				"A big pacific storm churned up the ocean at documents Greywhale Cove just south of San Francisco, into a sea of spray and mist, just as the sun was setting. ",
				};
		final short topicCount = 4;
		final int minimumTokenCount = 1;
		LatentDirichletAllocation lda = fitLDA(corpus, topicCount, minimumTokenCount);

		// Each word of the probe text occurs in one of the sample texts above.
		final String probe = "question something Different documents";
		final String query = "the orignial california's";

		// Metric codes: 0 -> symmetrized KL divergence, 1 -> cosine, 2 -> same dominant topic.
		double klDivergence = documentSimilarity(lda, probe, query, 0);
		double cosine = documentSimilarity(lda, probe, query, 1);
		double sameTopic = documentSimilarity(lda, probe, query, 2);

		System.out.println();
		System.out.println("Document similarity (KL Divergence) : " + klDivergence );
		System.out.println("Document similarity (Cosine distance) : " + cosine );
		System.out.println("Document similarity (Equal topic) : " + sameTopic );
	}
	
	
	/**
	 * Estimates the topic distribution of one text under a fitted model.
	 *
	 * <p>The text is tokenized with {@code WORMBASE_TOKENIZER_FACTORY} against the shared
	 * {@code symbolTable}, then handed to {@code bayesTopicEstimate} with 10000 samples, no
	 * burn-in, a sample lag of 1 and a fixed random seed, so repeated calls use the same random
	 * sequence.
	 *
	 * @param model fitted LDA model
	 * @param doc text to analyse
	 * @return the array returned by {@code bayesTopicEstimate} (presumably one weight per topic;
	 *         confirm against the LingPipe documentation)
	 */
	public static double[] documentTopics ( LatentDirichletAllocation model , String doc ) {
		final int sampleCount = 10000;
		final int burnInSamples = 0;
		final int lag = 1;
		final long seed = 6474835;

		int[] wordIds = model.tokenizeDocument(doc, WORMBASE_TOKENIZER_FACTORY, symbolTable);
		Random rng = new Random(seed);
		return model.bayesTopicEstimate(wordIds, sampleCount, burnInSamples, lag, rng);
	}
	/**
	 * Compares two texts by first estimating each one's topic distribution with
	 * {@code documentTopics} and then delegating to the array-based overload.
	 *
	 * @param model fitted LDA model used for both estimates
	 * @param doc1 first text
	 * @param doc2 second text
	 * @param metric metric selector, interpreted by the array-based overload
	 * @return the score computed by the array-based overload
	 */
	public static double documentSimilarity ( LatentDirichletAllocation model , String doc1, String doc2, int metric ) {
		return documentSimilarity(documentTopics(model, doc1), documentTopics(model, doc2), metric);
	}
	
	/**
	 * Scores two topic-weight vectors against each other.
	 *
	 * <ul>
	 *   <li>{@code metric == 1}: cosine of the angle between the vectors. The result is
	 *       {@code NaN} when either vector has zero norm (0/0).</li>
	 *   <li>{@code metric == 2}: {@code 1.0} when both vectors have their largest entry at the
	 *       same index, otherwise {@code 0.0}.</li>
	 *   <li>any other value: {@code Statistics.symmetrizedKlDivergence} of the two vectors
	 *       (a divergence, so smaller means more alike).</li>
	 * </ul>
	 *
	 * <p>For metrics 1 and 2 only the first {@code docTopics1.length} positions are visited, so
	 * {@code docTopics2} must be at least as long as {@code docTopics1}.
	 *
	 * @param docTopics1 first weight vector
	 * @param docTopics2 second weight vector
	 * @param metric metric selector as described above
	 * @return the score for the selected metric
	 * @throws ArrayIndexOutOfBoundsException for metrics 1 and 2 when {@code docTopics2} is
	 *         shorter than {@code docTopics1}
	 */
	public static double documentSimilarity ( double docTopics1[], double docTopics2[], int metric ) {
		switch ( metric ) {
			case 1:
				return cosineSimilarity(docTopics1, docTopics2);
			case 2:
				return sameDominantTopic(docTopics1, docTopics2) ? 1.0 : 0.0;
			default:
				return Statistics.symmetrizedKlDivergence(docTopics1, docTopics2);
		}
	}

	/** Dot product of the two vectors divided by the product of their Euclidean norms. */
	private static double cosineSimilarity ( double[] first, double[] second ) {
		double dot = 0;
		double squaredNorm1 = 0;
		double squaredNorm2 = 0;
		for ( int i = 0 ; i < first.length ; i++ ) {
			dot += first[i] * second[i];
			squaredNorm1 += first[i] * first[i];
			squaredNorm2 += second[i] * second[i];
		}
		return dot / ( Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2) );
	}

	/** Tells whether both vectors reach their maximum at the same index (first maximum wins on ties). */
	private static boolean sameDominantTopic ( double[] first, double[] second ) {
		// Start below every finite value. Double.MIN_VALUE is the smallest POSITIVE double,
		// so using it as the seed would make entries <= 0 impossible to select.
		double best1 = Double.NEGATIVE_INFINITY;
		double best2 = Double.NEGATIVE_INFINITY;
		int index1 = -1;
		int index2 = -1;
		for ( int i = 0 ; i < first.length ; i++ ) {
			if ( first[i] > best1 ) { best1 = first[i]; index1 = i; }
			if ( second[i] > best2 ) { best2 = second[i]; index2 = i; }
		}
		return index1 == index2;
	}
	
    /**
     * Fits an LDA model to the given texts with LingPipe's Gibbs sampler and prints a full report
     * of the final sample to standard output.
     *
     * <p>Texts are tokenized with {@code WORMBASE_TOKENIZER_FACTORY} against the shared
     * {@code symbolTable}. The sampler runs with fixed hyper-parameters (topic prior 0.1, word
     * prior 0.01, no burn-in, lag 1, 10000 samples) and a fixed random seed.
     *
     * @param data one text per document
     * @param numTopics number of topics to fit
     * @param minTokenCount passed through to {@code tokenizeDocuments} (presumably the minimum
     *        corpus frequency for a token to be kept; confirm against the LingPipe documentation)
     * @return the model taken from the sample returned by the sampler
     * @throws Exception declared for callers; kept for interface compatibility
     */
    public static LatentDirichletAllocation fitLDA ( String[] data , short numTopics, int minTokenCount ) throws Exception {
        // Sampler configuration.
        final double topicPrior = 0.1;
        final double wordPrior = 0.01;
        final int burninEpochs = 0;
        final int sampleLag = 1;
        final int numSamples = 10000;
        final long randomSeed = 6474835;

        // Report configuration.
        final int maxWordsPerTopic = 200;
        final int maxTopicsPerDoc = numTopics;
        final boolean reportTokens = true;

        CharSequence[] articleTexts = readCorpus(data);
        int[][] docTokens = LatentDirichletAllocation.tokenizeDocuments(articleTexts, WORMBASE_TOKENIZER_FACTORY, symbolTable, minTokenCount);

        // The handler is invoked by the sampler and also prints the closing report.
        LdaReportingHandler handler = new LdaReportingHandler(symbolTable);
        LatentDirichletAllocation.GibbsSample sample = LatentDirichletAllocation.gibbsSampler(docTokens, numTopics, topicPrior, wordPrior, burninEpochs, sampleLag, numSamples, new Random(randomSeed), handler);

        handler.fullReport(sample, maxWordsPerTopic, maxTopicsPerDoc, reportTokens);
        return sample.lda();
    }

    /**
     * Copies the given texts into a {@code CharSequence} array and prints the document count and
     * the total character count to standard output.
     *
     * @param data texts to copy; neither the array nor its elements may be {@code null}
     * @return a new array holding the same strings in the same order
     */
    static CharSequence[] readCorpus(String data[]) {
        CharSequence[] corpus = new CharSequence[data.length];
        int totalChars = 0;
        for (int i = 0; i < data.length; i++) {
            corpus[i] = data[i];
            totalChars += data[i].length();
        }
        System.out.println("#articles=" + corpus.length + " #chars=" + totalChars);
        return corpus;
    }

    /**
     * Builds the tokenizer chain used throughout this class.
     *
     * <p>Starting from {@code BASE_TOKENIZER_FACTORY}, the wrappers are applied in this order:
     * {@code NonAlphaStopTokenizerFactory}, {@code LowerCaseTokenizerFactory},
     * {@code EnglishStopTokenizerFactory}, {@code StopTokenizerFactory} with
     * {@code STOPWORD_SET}, and finally {@code StemTokenizerFactory}.
     *
     * @return the outermost factory of the chain
     */
    static final TokenizerFactory simpleTokenizerFactory() {
        TokenizerFactory withLetters = new NonAlphaStopTokenizerFactory(BASE_TOKENIZER_FACTORY);
        TokenizerFactory lowerCased = new LowerCaseTokenizerFactory(withLetters);
        TokenizerFactory englishFiltered = new EnglishStopTokenizerFactory(lowerCased);
        TokenizerFactory customFiltered = new StopTokenizerFactory(englishFiltered, STOPWORD_SET);
        return new StemTokenizerFactory(customFiltered);
    }

    /**
     * Decides whether a candidate stem is acceptable: it must have at least two characters and
     * contain at least one character listed in {@code VOWELS}.
     *
     * @param stem candidate stem
     * @return {@code true} when the stem passes both checks
     */
    static boolean validStem(String stem) {
        if (stem.length() >= 2) {
            for (char c : stem.toCharArray()) {
                for (char vowel : VOWELS) {
                    if (vowel == c) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    // Earlier pattern, kept for reference only (letter or digit or hyphen, hyphen written as \x2D):
    //static final TokenizerFactory BASE_TOKENIZER_FACTORY = new RegExTokenizerFactory("[\\x2Da-zA-Z0-9]+");
    
    // Innermost tokenizer of the chain built by simpleTokenizerFactory(). A token is either two
    // runs of Unicode letters joined by a single hyphen, or a run of ASCII letters and digits.
    static final TokenizerFactory BASE_TOKENIZER_FACTORY = new RegExTokenizerFactory("\\p{L}+-\\p{L}+|[a-zA-Z0-9]+");
	
    // Characters that validStem() accepts as evidence of a usable stem; note that 'y' is included.
    static final char[] VOWELS = new char[] { 'a', 'e', 'i', 'o', 'u', 'y' };

	static final String[] STOPWORD_LIST = new String[] {
    "able",
    "about",
    "above",
    "abroad",
    "according",
    "accordingly",
    "across",
    "actually",
    "adj",
    "after",
    "afterwards",
    "again",
    "against",
    "ago",
    "ahead",
    "ain't",
    "all",
    "allow",
    "allows",
    "almost",
    "alone",
    "along",
    "alongside",
    "already",
    "also",
    "although",
    "always",
    "am",
    "amid",
    "amidst",
    "among",
    "amongst",
    "an",
    "and",
    "another",
    "any",
    "anybody",
    "anyhow",
    "anyone",
    "anything",
    "anyway",
    "anyways",
    "anywhere",
    "apart",
    "appear",
    "appreciate",
    "appropriate",
    "are",
    "aren't",
    "around",
    "as",
    "a's",
    "aside",
    "ask",
    "asking",
    "associated",
    "at",
    "available",
    "away",
    "awfully",
    "back",
    "backward",
    "backwards",
    "be",
    "became",
    "because",
    "become",
    "becomes",
    "becoming",
    "been",
    "before",
    "beforehand",
    "begin",
    "behind",
    "being",
    "believe",
    "below",
    "beside",
    "besides",
    "best",
    "better",
    "between",
    "beyond",
    "both",
    "brief",
    "but",
    "by",
    "came",
    "can",
    "cannot",
    "cant",
    "can't",
    "caption",
    "cause",
    "causes",
    "certain",
    "certainly",
    "changes",
    "clearly",
    "c'mon",
    "co",
    "co",
    "com",
    "come",
    "comes",
    "concerning",
    "consequently",
    "consider",
    "considering",
    "contain",
    "containing",
    "contains",
    "corresponding",
    "could",
    "couldn't",
    "course",
    "c's",
    "currently",
    "dare",
    "daren't",
    "definitely",
    "described",
    "despite",
    "did",
    "didn't",
    "different",
    "directly",
    "do",
    "de",
    "does",
    "doesn't",
    "doing",
    "done",
    "don't",
    "down",
    "downwards",
    "during",
    "each",
    "edu",
    "eg",
    "eight",
    "eighty",
    "either",
    "else",
    "elsewhere",
    "end",
    "ending",
    "enough",
    "entirely",
    "especially",
    "et",
    "etc",
    "even",
    "ever",
    "evermore",
    "every",
    "everybody",
    "everyone",
    "everything",
    "everywhere",
    "ex",
    "exactly",
    "example",
    "except",
    "fairly",
    "far",
    "farther",
    "few",
    "fewer",
    "fifth",
    "first",
    "five",
    "followed",
    "following",
    "follows",
    "for",
    "forever",
    "former",
    "formerly",
    "forth",
    "forward",
    "found",
    "four",
    "from",
    "further",
    "furthermore",
    "get",
    "gets",
    "getting",
    "given",
    "gives",
    "go",
    "goes",
    "going",
    "gone",
    "got",
    "gotten",
    "greetings",
    "had",
    "hadn't",
    "half",
    "happens",
    "hardly",
    "has",
    "hasn't",
    "have",
    "haven't",
    "having",
    "he",
    "he'd",
    "he'll",
    "hello",
    "help",
    "hence",
    "her",
    "here",
    "hereafter",
    "hereby",
    "herein",
    "here's",
    "hereupon",
    "hers",
    "herself",
    "he's",
    "hi",
    "him",
    "himself",
    "his",
    "hither",
    "hopefully",
    "how",
    "howbeit",
    "however",
    "hundred",
    "i'd",
    "ie",
    "if",
    "ignored",
    "i'll",
    "i'm",
    "immediate",
    "in",
    "inasmuch",
    "inc",
    "inc",
    "indeed",
    "indicate",
    "indicated",
    "indicates",
    "inner",
    "inside",
    "insofar",
    "instead",
    "into",
    "inward",
    "is",
    "isn't",
    "it",
    "it'd",
    "it'll",
    "its",
    "it's",
    "itself",
    "i've",
    "just",
    "keep",
    "keeps",
    "kept",
    "know",
    "known",
    "knows",
    "last",
    "lately",
    "later",
    "latter",
    "latterly",
    "least",
    "less",
    "lest",
    "let",
    "let's",
    "like",
    "liked",
    "likely",
    "likewise",
    "little",
    "look",
    "looking",
    "looks",
    "low",
    "lower",
    "ltd",
    "made",
    "mainly",
    "make",
    "makes",
    "many",
    "may",
    "maybe",
    "mayn't",
    "me",
    "mean",
    "meantime",
    "meanwhile",
    "merely",
    "might",
    "mightn't",
    "mine",
    "minus",
    "miss",
    "more",
    "moreover",
    "most",
    "mostly",
    "mr",
    "mrs",
    "much",
    "must",
    "mustn't",
    "my",
    "myself",
    "name",
    "namely",
    "nd",
    "near",
    "nearly",
    "necessary",
    "need",
    "needn't",
    "needs",
    "neither",
    "never",
    "neverf",
    "neverless",
    "nevertheless",
    "new",
    "next",
    "nine",
    "ninety",
    "no",
    "nobody",
    "non",
    "none",
    "nonetheless",
    "noone",
    "no-one",
    "nor",
    "normally",
    "not",
    "nothing",
    "notwithstanding",
    "novel",
    "now",
    "nowhere",
    "obviously",
    "of",
    "off",
    "often",
    "oh",
    "ok",
    "okay",
    "old",
    "on",
    "once",
    "one",
    "ones",
    "one's",
    "only",
    "onto",
    "opposite",
    "or",
    "other",
    "others",
    "otherwise",
    "ought",
    "oughtn't",
    "our",
    "ours",
    "ourselves",
    "out",
    "outside",
    "over",
    "overall",
    "own",
    "particular",
    "particularly",
    "past",
    "per",
    "perhaps",
    "placed",
    "please",
    "plus",
    "possible",
    "presumably",
    "probably",
    "provided",
    "provides",
    "que",
    "quite",
    "qv",
    "rather",
    "rd",
    "re",
    "really",
    "reasonably",
    "recent",
    "recently",
    "regarding",
    "regardless",
    "regards",
    "relatively",
    "respectively",
    "right",
    "round",
    "said",
    "same",
    "saw",
    "say",
    "saying",
    "says",
    "second",
    "secondly",
    "see",
    "seeing",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "seen",
    "self",
    "selves",
    "sensible",
    "sent",
    "serious",
    "seriously",
    "seven",
    "several",
    "shall",
    "shan't",
    "she",
    "she'd",
    "she'll",
    "she's",
    "should",
    "shouldn't",
    "since",
    "six",
    "so",
    "some",
    "somebody",
    "someday",
    "somehow",
    "someone",
    "something",
    "sometime",
    "sometimes",
    "somewhat",
    "somewhere",
    "soon",
    "sorry",
    "specified",
    "specify",
    "specifying",
    "still",
    "sub",
    "such",
    "sup",
    "sure",
    "take",
    "taken",
    "taking",
    "tell",
    "tends",
    "th",
    "than",
    "thank",
    "thanks",
    "thanx",
    "that",
    "that'll",
    "thats",
    "that's",
    "that've",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "thence",
    "there",
    "thereafter",
    "thereby",
    "there'd",
    "therefore",
    "therein",
    "there'll",
    "there're",
    "theres",
    "there's",
    "thereupon",
    "there've",
    "these",
    "they",
    "they'd",
    "they'll",
    "they're",
    "they've",
    "thing",
    "things",
    "think",
    "third",
    "thirty",
    "this",
    "thorough",
    "thoroughly",
    "those",
    "though",
    "three",
    "through",
    "throughout",
    "thru",
    "thus",
    "till",
    "to",
    "together",
    "too",
    "took",
    "toward",
    "towards",
    "tried",
    "tries",
    "truly",
    "try",
    "trying",
    "t's",
    "twice",
    "two",
    "un",
    "under",
    "underneath",
    "undoing",
    "unfortunately",
    "unless",
    "unlike",
    "unlikely",
    "until",
    "unto",
    "up",
    "upon",
    "upwards",
    "us",
    "use",
    "used",
    "useful",
    "uses",
    "using",
    "usually",
    "v",
    "value",
    "various",
    "versus",
    "very",
    "via",
    "viz",
    "vs",
    "want",
    "wants",
    "was",
    "wasn't",
    "way",
    "we",
    "we'd",
    "welcome",
    "well",
    "we'll",
    "went",
    "were",
    "we're",
    "weren't",
    "we've",
    "what",
    "whatever",
    "what'll",
    "what's",
    "what've",
    "when",
    "whence",
    "whenever",
    "where",
    "whereafter",
    "whereas",
    "whereby",
    "wherein",
    "where's",
    "whereupon",
    "wherever",
    "whether",
    "which",
    "whichever",
    "while",
    "whilst",
    "whither",
    "who",
    "who'd",
    "whoever",
    "whole",
    "who'll",
    "whom",
    "whomever",
    "who's",
    "whose",
    "why",
    "will",
    "willing",
    "wish",
    "with",
    "within",
    "without",
    "wonder",
    "won't",
    "would",
    "wouldn't",
    "yes",
    "yet",
    "you",
    "you'd",
    "you'll",
    "your",
    "you're",
    "yours",
    "yourself",
    "yourselves",
    "you've",
    "zero",
    "a",
    "abst",
    "accordance",
    "act",
    "added",
    "adopted",
    "affected",
    "affecting",
    "affects",
    "ah",
    "announce",
    "anymore",
    "apparently",
    "approximately",
    "aren",
    "arent",
    "arise",
    "auth",
    "b",
    "beginning",
    "beginnings",
    "begins",
    "biol",
    "briefly",
    "c",
    "ca",
    "couldnt",
    "d",
    "date",
    "due",
    "e",
    "ed",
    "effect",
    "et-al",
    "f",
    "ff",
    "fix",
    "g",
    "gave",
    "give",
    "giving",
    "h",
    "hed",
    "heres",
    "hes",
    "hid",
    "home",
    "i",
    "id",
    "im",
    "immediately",
    "importance",
    "important",
    "index",
    "information",
    "invention",
    "itd",
    "j",
    "keys",
    "kg",
    "km",
    "l",
    "largely",
    "lets",
    "line",
    "ll",
    "m",
    "means",
    "mg",
    "million",
    "ml",
    "mug",
    "n",
    "na",
    "nay",
    "necessarily",
    "nos",
    "noted",
    "o",
    "obtain",
    "obtained",
    "omitted",
    "ord",
    "owing",
    "p",
    "page",
    "pages",
    "part",
    "poorly",
    "possibly",
    "potentially",
    "pp",
    "predominantly",
    "present",
    "previously",
    "primarily",
    "promptly",
    "proud",
    "put",
    "q",
    "quickly",
    "r",
    "ran",
    "readily",
    "ref",
    "refs",
    "related",
    "research",
    "resulted",
    "resulting",
    "results",
    "run",
    "s",
    "sec",
    "section",
    "shed",
    "shes",
    "show",
    "showed",
    "shown",
    "showns",
    "shows",
    "significant",
    "significantly",
    "similar",
    "similarly",
    "slightly",
    "somethan",
    "specifically",
    "state",
    "states",
    "stop",
    "strongly",
    "substantially",
    "successfully",
    "sufficiently",
    "suggest",
    "t",
    "thered",
    "thereof",
    "therere",
    "thereto",
    "theyd",
    "theyre",
    "thou",
    "thoughh",
    "thousand",
    "throug",
    "til",
    "tip",
    "ts",
    "u",
    "ups",
    "usefully",
    "usefulness",
    "ve",
    "vol",
    "vols",
    "w",
    "wed",
    "whats",
    "wheres",
    "whim",
    "whod",
    "whos",
    "widely",
    "words",
    "world",
    "www",
    "x",
    "y",
    "youd",
    "youre",
    "z",
    "area",
    "areas",
    "asked",
    "asks",
    "backed",
    "backing",
    "backs",
    "began",
    "beings",
    "big",
    "case",
    "cases",
    "clear",
    "differ",
    "differently",
    "downed",
    "downing",
    "downs",
    "early",
    "ended",
    "ends",
    "evenly",
    "face",
    "faces",
    "fact",
    "facts",
    "felt",
    "find",
    "finds",
    "full",
    "fully",
    "furthered",
    "furthering",
    "furthers",
    "general",
    "generally",
    "good",
    "goods",
    "great",
    "greater",
    "greatest",
    "group",
    "grouped",
    "grouping",
    "groups",
    "high",
    "higher",
    "highest",
    "interest",
    "interested",
    "interesting",
    "interests",
    "kind",
    "knew",
    "large",
    "latest",
    "long",
    "longer",
    "longest",
    "making",
    "man",
    "member",
    "members",
    "men",
    "needed",
    "needing",
    "newer",
    "newest",
    "number",
    "numbers",
    "older",
    "oldest",
    "open",
    "opened",
    "opening",
    "opens",
    "order",
    "ordered",
    "ordering",
    "orders",
    "parted",
    "parting",
    "parts",
    "place",
    "places",
    "point",
    "pointed",
    "pointing",
    "points",
    "presented",
    "presenting",
    "presents",
    "problem",
    "problems",
    "puts",
    "room",
    "rooms",
    "seconds",
    "sees",
    "showing",
    "side",
    "sides",
    "small",
    "smaller",
    "smallest",
    "thinks",
    "thought",
    "thoughts",
    "today",
    "turn",
    "turned",
    "turning",
    "turns",
    "wanted",
    "wanting",
    "ways",
    "wells",
    "work",
    "worked",
    "working",
    "works",
    "year",
    "years",
    "young",
    "younger",
    "youngest",
    "un",
    "una",
    "unas",
    "unos",
    "uno",
    "sobre",
    "de",
    "todo",
    "también",
    "tras",
    "otro",
    "algún",
    "alguno",
    "alguna",
    "algunos",
    "algunas",
    "ser",
    "es",
    "soy",
    "eres",
    "somos",
    "sois",
    "estoy",
    "esta",
    "estamos",
    "estais",
    "estan",
    "como",
    "en",
    "para",
    "atras",
    "porque",
    "por qué",
    "estado",
    "estaba",
    "ante",
    "antes",
    "siendo",
    "ambos",
    "pero",
    "por",
    "poder",
    "puede",
    "puedo",
    "podemos",
    "podeis",
    "pueden",
    "fui",
    "fue",
    "fuimos",
    "fueron",
    "hacer",
    "hago",
    "hace",
    "hacemos",
    "haceis",
    "hacen",
    "cada",
    "fin",
    "incluso",
    "primero",
	"desde",
    "conseguir",
    "consigo",
    "consigue",
    "consigues",
    "conseguimos",
    "consiguen",
    "ir",
    "voy",
    "va",
    "vamos",
    "vais",
    "van",
    "vaya",
    "gueno",
    "ha",
    "tener",
    "tengo",
    "tiene",
    "tenemos",
    "teneis",
    "tienen",
    "el",
    "la",
    "lo",
    "las",
    "los",
    "su",
    "aqui",
    "mio",
    "tuyo",
    "ellos",
    "ellas",
    "nos",
    "nosotros",
    "vosotros",
    "vosotras",
    "si",
    "dentro",
    "solo",
    "solamente",
    "saber",
    "sabes",
    "sabe",
    "sabemos",
    "sabeis",
    "saben",
    "ultimo",
    "largo",
    "bastante",
    "haces",
    "muchos",
    "aquellos",
    "aquellas",
    "sus",
    "entonces",
    "tiempo",
    "verdad",
    "verdadero",
    "verdadera",
    "cierto",
    "ciertos",
    "cierta",
    "ciertas",
    "intentar",
    "intento",
    "intenta",
    "intentas",
    "intentamos",
    "intentais",
    "intentan",
    "dos",
    "bajo",
    "arriba",
    "encima",
    "usar",
    "uso",
    "usas",
    "usa",
    "usamos",
    "usais",
    "usan",
    "emplear",
    "empleo",
    "empleas",
    "emplean",
    "ampleamos",
    "empleais",
    "valor",
    "muy",
    "era",
    "eras",
    "eramos",
    "eran",
    "modo",
    "bien",
    "cual",
    "cuando",
    "donde",
    "mientras",
    "quien",
    "con",
    "entre",
    "sin",
    "trabajo",
    "trabajar",
    "trabajas",
    "trabaja",
    "trabajamos",
    "trabajais",
    "trabajan",
    "podria",
    "podrias",
    "podriamos",
    "podrian",
    "podriais",
    "yo",
    "aquel"};
    
    // Symbol table shared by the whole class: passed to tokenizeDocuments() in fitLDA(), to
    // tokenizeDocument() in documentTopics(), and to the LdaReportingHandler for id-to-word lookup.
    static final SymbolTable symbolTable = new MapSymbolTable();

    // Set view of STOPWORD_LIST (duplicates in the list collapse here); consumed by the
    // StopTokenizerFactory stage in simpleTokenizerFactory().
    static final Set<String> STOPWORD_SET = new HashSet<String>(Arrays.asList(STOPWORD_LIST));

    // Must stay BELOW BASE_TOKENIZER_FACTORY and STOPWORD_SET: static initializers run in textual
    // order and simpleTokenizerFactory() reads both of those fields.
    static final TokenizerFactory WORMBASE_TOKENIZER_FACTORY = simpleTokenizerFactory();

    /**
     * Token filter that rejects tokens shorter than two characters and tokens that contain no
     * letter at all. Rejected tokens are reported to the superclass as {@code null} (presumably
     * the superclass's signal to drop the token; confirm against the LingPipe documentation).
     */
    static class NonAlphaStopTokenizerFactory extends ModifyTokenTokenizerFactory {
        static final long serialVersionUID = -3401639068551227864L;

        public NonAlphaStopTokenizerFactory(TokenizerFactory factory) {
            super(factory);
        }

        /** Returns the token unchanged, or {@code null} when {@link #stop(String)} rejects it. */
        public String modifyToken(String token) {
            if (stop(token)) {
                return null;
            }
            return token;
        }

        /**
         * @param token token to inspect
         * @return {@code true} when the token has fewer than two characters or no letter
         */
        public boolean stop(String token) {
            if (token.length() < 2) {
                return true;
            }
            for (char c : token.toCharArray()) {
                if (Character.isLetter(c)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Crude suffix stripper. The first entry of {@code SUFFIXES} that the token ends with is cut
     * off; the shortened form is used only when {@code validStem} accepts it, otherwise the token
     * is left as it was. Later suffixes are not tried once one has matched.
     */
    static class StemTokenizerFactory extends ModifyTokenTokenizerFactory {
        static final long serialVersionUID = -6045422132691926248L;

        public StemTokenizerFactory(TokenizerFactory factory) {
            super(factory);
        }

        // Checked in this order; "ss" precedes "s" so a doubled final s is removed as a unit.
        static final String[] SUFFIXES = new String[] { "ss", "ies", "sses", "s" };

        /** Returns the stem of the token, or the token itself when no acceptable stem exists. */
        public String modifyToken(String token) {
            for (int i = 0; i < SUFFIXES.length; i++) {
                String suffix = SUFFIXES[i];
                if (!token.endsWith(suffix)) {
                    continue;
                }
                String stem = token.substring(0, token.length() - suffix.length());
                if (validStem(stem)) {
                    return stem;
                }
                return token;
            }
            return token;
        }
    }

}

class LdaReportingHandler implements ObjectHandler<LatentDirichletAllocation.GibbsSample> {

    /** Table used to turn word ids back into words when printing. */
    private final SymbolTable mSymbolTable;
    /** Wall-clock time, in milliseconds, at which this handler was created. */
    private final long mStartTime;

    /**
     * Creates a handler and records the current time as the start of the run.
     *
     * @param symbolTable table used for id-to-word lookups in reports
     */
    LdaReportingHandler(SymbolTable symbolTable) {
        this.mSymbolTable = symbolTable;
        this.mStartTime = System.currentTimeMillis();
    }

    /**
     * Prints one progress line for the sample and, when the epoch number is a multiple of 10,
     * a second line with the corpus log2 probability and the per-token cross-entropy rate.
     *
     * @param sample sample to report on
     */
    public void handle(LatentDirichletAllocation.GibbsSample sample) {
        int epoch = sample.epoch();
        long elapsedMs = System.currentTimeMillis() - mStartTime;
        System.out.printf("Epoch=%3d   elapsed time=%s\n", epoch, Strings.msToString(elapsedMs));

        if ((epoch % 10) != 0) {
            return;
        }
        double log2Prob = sample.corpusLog2Probability();
        double crossEntropyRate = -log2Prob / sample.numTokens();
        System.out.println("      log2 p(corpus|phi,theta)=" + log2Prob
                           + "     token cross-entropy rate=" + crossEntropyRate);
    }

    /**
     * Prints a complete report of a sample to standard output: a header with the sample's sizes,
     * one table per topic with its most frequent words, and one table per document with its most
     * frequent topics, optionally followed by every token of the document tagged with its topic.
     *
     * @param sample sample to report on
     * @param maxWordsPerTopic upper bound on the rows printed per topic table
     * @param maxTopicsPerDoc upper bound on the rows printed per document table
     * @param reportTokens whether to print each document's tokens with their sampled topics
     */
    void fullReport(LatentDirichletAllocation.GibbsSample sample, int maxWordsPerTopic, int maxTopicsPerDoc, boolean reportTokens) {
        int topicTotal = sample.numTopics();
        int documentTotal = sample.numDocuments();

        System.out.println("\nFull Report");
        System.out.println("epoch=" + sample.epoch());
        System.out.println("numDocs=" + documentTotal);
        System.out.println("numTokens=" + sample.numTokens());
        System.out.println("numWords=" + sample.numWords());
        System.out.println("numTopics=" + topicTotal);

        for (int topic = 0; topic < topicTotal; ++topic) {
            reportTopic(sample, topic, maxWordsPerTopic);
        }
        for (int doc = 0; doc < documentTotal; ++doc) {
            reportDocument(sample, doc, maxTopicsPerDoc, reportTokens);
        }
    }

    /** Prints the word table of one topic, most frequent words first. */
    private void reportTopic(LatentDirichletAllocation.GibbsSample sample, int topic, int maxWordsPerTopic) {
        int vocabularySize = sample.numWords();
        int corpusTokens = sample.numTokens();
        int topicTotalCount = sample.topicCount(topic);

        ObjectToCounterMap<Integer> countsByWord = new ObjectToCounterMap<Integer>();
        for (int w = 0; w < vocabularySize; ++w) {
            countsByWord.set(Integer.valueOf(w), sample.topicWordCount(topic, w));
        }
        List<Integer> rankedWords = countsByWord.keysOrderedByCountList();

        System.out.println("\nTOPIC " + topic  + "  (total count=" + topicTotalCount + ")");
        System.out.println("SYMBOL             WORD    COUNT   PROB          Z");
        System.out.println("--------------------------------------------------");

        int rows = Math.min(maxWordsPerTopic, rankedWords.size());
        for (int rank = 0; rank < rows; ++rank) {
            int wordId = rankedWords.get(rank);
            int countInTopic = sample.topicWordCount(topic, wordId);
            double z = binomialZ(countInTopic, topicTotalCount, sample.wordCount(wordId), corpusTokens);
            System.out.printf("%6d  %15s  %7d   %4.3f  %8.1f\n",
                              wordId,
                              mSymbolTable.idToSymbol(wordId),
                              countInTopic,
                              sample.topicWordProb(topic, wordId),
                              z);
        }
    }

    /** Prints the topic table of one document and, on request, its tokens with their topics. */
    private void reportDocument(LatentDirichletAllocation.GibbsSample sample, int doc, int maxTopicsPerDoc, boolean reportTokens) {
        int topicTotal = sample.numTopics();

        // One pass gathers both the per-topic counts and their sum.
        int docTokenTotal = 0;
        ObjectToCounterMap<Integer> countsByTopic = new ObjectToCounterMap<Integer>();
        for (int t = 0; t < topicTotal; ++t) {
            int count = sample.documentTopicCount(doc, t);
            docTokenTotal += count;
            countsByTopic.set(Integer.valueOf(t), count);
        }
        List<Integer> rankedTopics = countsByTopic.keysOrderedByCountList();

        System.out.println("\nDOC " + doc);
        System.out.println("TOPIC    COUNT    PROB");
        System.out.println("----------------------");

        double prior = sample.documentTopicPrior();
        int rows = Math.min(rankedTopics.size(), maxTopicsPerDoc);
        for (int rank = 0; rank < rows; ++rank) {
            int topic = rankedTopics.get(rank);
            int count = sample.documentTopicCount(doc, topic);
            // Count smoothed by the prior, normalised over all topics.
            double prob = (count + prior) / (docTokenTotal + topicTotal * prior);
            System.out.printf("%5d  %7d   %4.3f\n", topic, count, prob);
        }
        System.out.println();

        if (reportTokens) {
            int length = sample.documentLength(doc);
            for (int tok = 0; tok < length; ++tok) {
                int symbol = sample.word(doc, tok);
                short topic = sample.topicSample(doc, tok);
                System.out.print(mSymbolTable.idToSymbol(symbol) + "(" + topic + ") ");
            }
            System.out.println();
        }
    }

    /**
     * Binomial z-score of a word's count inside a sub-collection (a document or a topic) relative
     * to its rate in the whole corpus.
     *
     * <p>With {@code p = wordCountinCorpus / wordsInCorpus} and {@code n = wordsInDoc}, the
     * expected count is {@code n * p}, the standard deviation is {@code sqrt(n * p * (1 - p))},
     * and the result is {@code (wordCountInDoc - n * p) / sqrt(n * p * (1 - p))}. The number of
     * trials is the size of the sub-collection for both the mean and the variance.
     *
     * <p>The result is {@code NaN} or infinite when the standard deviation is zero, i.e. when
     * {@code p} is 0 or 1 or {@code wordsInDoc} is 0.
     *
     * @param wordCountInDoc observed count of the word in the sub-collection
     * @param wordsInDoc total number of tokens in the sub-collection
     * @param wordCountinCorpus count of the word in the whole corpus
     * @param wordsInCorpus total number of tokens in the corpus
     * @return the z-score of the observed count
     */
    static double binomialZ(double wordCountInDoc, double wordsInDoc, double wordCountinCorpus, double wordsInCorpus) {
        double pCorpus = wordCountinCorpus / wordsInCorpus;
        double expected = wordsInDoc * pCorpus;
        // Variance of Binomial(n, p) is n*p*(1-p) with n = wordsInDoc, the same n used for
        // the expectation above.
        double dev = Math.sqrt(wordsInDoc * pCorpus * (1 - pCorpus));
        return (wordCountInDoc - expected) / dev;
    }

}